Import Essentials¶

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [14]:
df= pd.read_csv('diamonds.csv')

Data Overview¶

In [15]:
df.head()
Out[15]:
Unnamed: 0 carat cut color clarity depth table price x y z
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75

Dropping Unnamed: 0 column because we don`t need it¶

In [16]:
df.drop('Unnamed: 0',axis=1, inplace=True)

EDA¶

In [17]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB
In [18]:
df.describe()
Out[18]:
carat depth table price x y z
count 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000
mean 0.797940 61.749405 57.457184 3932.799722 5.731157 5.734526 3.538734
std 0.474011 1.432621 2.234491 3989.439738 1.121761 1.142135 0.705699
min 0.200000 43.000000 43.000000 326.000000 0.000000 0.000000 0.000000
25% 0.400000 61.000000 56.000000 950.000000 4.710000 4.720000 2.910000
50% 0.700000 61.800000 57.000000 2401.000000 5.700000 5.710000 3.530000
75% 1.040000 62.500000 59.000000 5324.250000 6.540000 6.540000 4.040000
max 5.010000 79.000000 95.000000 18823.000000 10.740000 58.900000 31.800000

Missing Values¶

In [19]:
print("Total Missing Values: ",df.isna().sum().sum())
print("Missing Values By Column: \n",df.isna().sum())
Total Missing Values:  0
Missing Values By Column: 
 carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

Duplicates¶

In [20]:
df.duplicated().sum()
Out[20]:
146
In [21]:
# Drop Duplicates
df.drop_duplicates(inplace=True)
df.duplicated().sum()
Out[21]:
0

Outliers¶

In [22]:
outliers = pd.DataFrame()

for col in df.select_dtypes(include=['number']).columns:
    # quartiles and IQR
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    # lower and upper bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # outliers by column
    col_outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]

    if not col_outliers.empty:
        outliers = pd.concat([outliers, col_outliers])

# Show the outliers found
outliers.info()
<class 'pandas.core.frame.DataFrame'>
Index: 8632 entries, 12246 to 51506
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    8632 non-null   float64
 1   cut      8632 non-null   object 
 2   color    8632 non-null   object 
 3   clarity  8632 non-null   object 
 4   depth    8632 non-null   float64
 5   table    8632 non-null   float64
 6   price    8632 non-null   int64  
 7   x        8632 non-null   float64
 8   y        8632 non-null   float64
 9   z        8632 non-null   float64
dtypes: float64(6), int64(1), object(3)
memory usage: 741.8+ KB

Visualization¶

In [23]:
#pair plot
sns.pairplot(df, hue='cut',palette="mako")
plt.show()
No description has been provided for this image
In [24]:
# Dropping outliers
df_cleaned = df.drop(outliers.index)

df_cleaned = df_cleaned.reset_index(drop=True)

df_cleaned.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47416 entries, 0 to 47415
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    47416 non-null  float64
 1   cut      47416 non-null  object 
 2   color    47416 non-null  object 
 3   clarity  47416 non-null  object 
 4   depth    47416 non-null  float64
 5   table    47416 non-null  float64
 6   price    47416 non-null  int64  
 7   x        47416 non-null  float64
 8   y        47416 non-null  float64
 9   z        47416 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.6+ MB
In [25]:
df_cleaned.describe()
Out[25]:
carat depth table price x y z
count 47416.000000 47416.000000 47416.000000 47416.000000 47416.000000 47416.000000 47416.000000
mean 0.708950 61.785908 57.261517 3112.258984 5.547416 5.552283 3.428830
std 0.371023 1.106205 2.021723 2751.212081 0.979717 0.973817 0.606057
min 0.200000 58.800000 52.000000 326.000000 3.730000 3.680000 1.410000
25% 0.380000 61.100000 56.000000 891.000000 4.640000 4.650000 2.860000
50% 0.600000 61.900000 57.000000 2064.000000 5.440000 5.460000 3.360000
75% 1.010000 62.500000 59.000000 4637.000000 6.410000 6.410000 3.980000
max 2.000000 64.700000 63.500000 11888.000000 8.280000 8.270000 5.300000
In [26]:
sns.pairplot(df_cleaned, hue='cut', palette='mako')
plt.show()
No description has been provided for this image
In [27]:
cat = df_cleaned.select_dtypes(include=['object']).columns
print(cat)
Index(['cut', 'color', 'clarity'], dtype='object')
In [28]:
for col in cat:
    plt.figure(figsize=(10, 5))
    sns.countplot(data=df_cleaned, x=col, palette='mako')
    plt.title(f'Count of {col}')
    plt.xticks(rotation=45)
    plt.show()
C:\Users\HP\AppData\Local\Temp\ipykernel_32068\2978708903.py:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df_cleaned, x=col, palette='mako')
No description has been provided for this image
C:\Users\HP\AppData\Local\Temp\ipykernel_32068\2978708903.py:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df_cleaned, x=col, palette='mako')
No description has been provided for this image
C:\Users\HP\AppData\Local\Temp\ipykernel_32068\2978708903.py:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df_cleaned, x=col, palette='mako')
No description has been provided for this image
In [77]:
# # Violin plots
# for col in cat:
#     plt.figure(figsize=(10, 5))
#     sns.violinplot(data=df_cleaned, x= col, y='price', palette="Blues")
#     plt.title(f'Violin plot of {col} by Price')
#     plt.show()
In [30]:
# Box plots
for col in cat:
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=df_cleaned, x= col, y='price', palette="Blues")
    plt.title(f'Box plot of {col} by Price')
    plt.show()
C:\Users\HP\AppData\Local\Temp\ipykernel_32068\2004675692.py:4: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df_cleaned, x= col, y='price', palette="Blues")
No description has been provided for this image
C:\Users\HP\AppData\Local\Temp\ipykernel_32068\2004675692.py:4: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df_cleaned, x= col, y='price', palette="Blues")
No description has been provided for this image
C:\Users\HP\AppData\Local\Temp\ipykernel_32068\2004675692.py:4: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df_cleaned, x= col, y='price', palette="Blues")
No description has been provided for this image

Exporting Cleaned Data¶

In [31]:
df_cleaned.to_csv('diamonds_cleaned.csv', index=False)

Encoding¶

In [32]:
df_cleaned = pd.read_csv('diamonds_cleaned.csv')
In [33]:
cat = df_cleaned.select_dtypes(include=['object']).columns
In [34]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder
le= LabelEncoder()
In [35]:
labeled_df = df_cleaned.copy()
for col in cat:
    labeled_df[col] = le.fit_transform(labeled_df[col])
labeled_df.head()
Out[35]:
carat cut color clarity depth table price x y z
0 0.23 2 1 3 61.5 55.0 326 3.95 3.98 2.43
1 0.21 3 1 2 59.8 61.0 326 3.89 3.84 2.31
2 0.29 3 5 5 62.4 58.0 334 4.20 4.23 2.63
3 0.31 1 6 3 63.3 58.0 335 4.34 4.35 2.75
4 0.24 4 6 7 62.8 57.0 336 3.94 3.96 2.48
In [36]:
plt.figure(figsize=(12, 8))
corr = labeled_df.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.show()
No description has been provided for this image

Data Splitting¶

In [37]:
from sklearn.model_selection import train_test_split

X = labeled_df.drop(columns=['price'],axis=1)
y = labeled_df['price']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)
In [38]:
#scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
In [39]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Modelling¶

In [40]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
#importing decision tree 



from sklearn.pipeline import make_pipeline


from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score

Linear Regression¶

In [41]:
model = LinearRegression() 
In [42]:
model.fit(X_train, y_train)
Out[42]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
fit_intercept  True
copy_X  True
tol  1e-06
n_jobs  None
positive  False
In [43]:
# Accuracy Training
train_accuracy = model.score(X_train, y_train)
print(f"Training Accuracy: {train_accuracy:.2f}")

# Accuracy Testing
test_accuracy = model.score(X_test, y_test)
print(f"Testing Accuracy: {test_accuracy:.2f}")


y_pred = model.predict(X_test)
train_pred = model.predict(X_train)

# Metrics
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5)
Training Accuracy: 0.89
Testing Accuracy: 0.89
In [44]:
print(f"R2 :{r2 :.4f}")
print(f"mae :{mae :.4f}")
print(f"mse :{mse :.4f}")
print(f"rmse :{rmse :.4f}")
print(f"Cross-validation scores: {cross_val_scores}")
print(f"Mean cross-validation score: {cross_val_scores.mean():.4f}")
R2 :0.8914
mae :610.5716
mse :829496.4105
rmse :910.7669
Cross-validation scores: [0.89286332 0.88849218 0.88771066 0.89109169 0.88591289]
Mean cross-validation score: 0.8892

Polynomial Regression¶

In [45]:
degree = 2
polyreg = make_pipeline(PolynomialFeatures(degree), LinearRegression())
polyreg.fit(X_train, y_train)
Out[45]:
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
                ('linearregression', LinearRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
steps  [('polynomialfeatures', ...), ('linearregression', ...)]
transform_input  None
memory  None
verbose  False
Parameters
degree  2
interaction_only  False
include_bias  True
order  'C'
Parameters
fit_intercept  True
copy_X  True
tol  1e-06
n_jobs  None
positive  False
In [46]:
# Accuracy Training
poly_train_accuracy = polyreg.score(X_train, y_train)
print(f"Training Accuracy: {poly_train_accuracy:.2f}")

# Accuracy Testing
poly_test_accuracy = polyreg.score(X_test, y_test)
print(f"Testing Accuracy: {poly_test_accuracy:.2f}")


#Predicting
y_pred_poly = polyreg.predict(X_test)
train_pred_poly = polyreg.predict(X_train)
#Metrics
r2_poly = r2_score(y_test, y_pred_poly)
mse_poly = mean_squared_error(y_test, y_pred_poly)
mae_poly = mean_absolute_error(y_test, y_pred_poly)
rmse_poly = np.sqrt(mse_poly)
cross_val_scores_poly = cross_val_score(polyreg, X_train, y_train, cv=5)
Training Accuracy: 0.93
Testing Accuracy: 0.93
In [47]:
train_accuracy_poly = polyreg.score(X_train, y_train)
print(f"Training Accuracy (Polynomial Regression): {train_accuracy_poly:.2f}")
print(f"R2 (Polynomial Regression): {r2_poly:.4f}")
print(f"mae (Polynomial Regression): {mae_poly:.4f}")
print(f"mse (Polynomial Regression): {mse_poly:.4f}")
print(f"rmse (Polynomial Regression): {rmse_poly:.4f}")
print(f"Cross-validation scores (Polynomial Regression): {cross_val_scores_poly}")
print(f"Mean cross-validation score (Polynomial Regression): {cross_val_scores_poly.mean():.4f}")
Training Accuracy (Polynomial Regression): 0.93
R2 (Polynomial Regression): 0.9322
mae (Polynomial Regression): 443.3649
mse (Polynomial Regression): 517720.3447
rmse (Polynomial Regression): 719.5279
Cross-validation scores (Polynomial Regression): [0.93080919 0.9330201  0.92831633 0.93380801 0.93079484]
Mean cross-validation score (Polynomial Regression): 0.9313

Decision Tree¶

In [48]:
parameters = {
    'max_depth': 20,
    'min_samples_leaf': 10,
    'max_features': 5,
    'random_state': 42
    
    }
    
    
treeModel = DecisionTreeRegressor(**parameters)
treeModel.fit(X_train, y_train)
Out[48]:
DecisionTreeRegressor(max_depth=20, max_features=5, min_samples_leaf=10,
                      random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
criterion  'squared_error'
splitter  'best'
max_depth  20
min_samples_split  2
min_samples_leaf  10
min_weight_fraction_leaf  0.0
max_features  5
random_state  42
max_leaf_nodes  None
min_impurity_decrease  0.0
ccp_alpha  0.0
monotonic_cst  None
In [49]:
tree_train_accuracy = treeModel.score(X_train, y_train)
print(f"Training Accuracy (Decision Tree): {tree_train_accuracy:.2f}")
tree_test_accuracy = treeModel.score(X_test, y_test)
print(f"Testing Accuracy (Decision Tree): {tree_test_accuracy:.2f}")
Training Accuracy (Decision Tree): 0.98
Testing Accuracy (Decision Tree): 0.97
In [50]:
#Predicting with Decision Tree
y_pred_tree = treeModel.predict(X_test)
# Metrics for Decision Tree
r2_tree = r2_score(y_test, y_pred_tree)
mse_tree = mean_squared_error(y_test, y_pred_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)
cross_val_scores_tree = cross_val_score(treeModel, X_train, y_train, cv=5)
In [51]:
print(f"R2 (Decision Tree): {r2_tree:.4f}")
print(f"mae (Decision Tree): {mae_tree:.4f}")
print(f"mse (Decision Tree): {mse_tree:.4f}")
print(f"rmse (Decision Tree): {rmse_tree:.4f}")
print(f"Cross-validation scores (Decision Tree): {cross_val_scores_tree}")
print(f"Mean cross-validation score (Decision Tree): {cross_val_scores_tree.mean():.4f}")
R2 (Decision Tree): 0.9715
mae (Decision Tree): 259.3015
mse (Decision Tree): 217591.4610
rmse (Decision Tree): 466.4670
Cross-validation scores (Decision Tree): [0.97070679 0.97435373 0.96975693 0.97050959 0.96997527]
Mean cross-validation score (Decision Tree): 0.9711

Classification¶

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier , plot_tree
from sklearn.metrics import classification_report ,accuracy_score ,recall_score, f1_score , confusion_matrix
from sklearn.svm import SVC
In [64]:
df_cls = df_cleaned.copy()
In [65]:
cut_map = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
color_map = {'J': 1, 'I': 2, 'H': 3, 'G': 4, 'F': 5, 'E': 6, 'D': 7}
clarity_map = {'I1': 1, 'SI2': 2, 'SI1': 3, 'VS2': 4, 'VS1': 5, 'VVS2': 6, 'VVS1': 7, 'IF': 8}
In [66]:
df_cls['cut'] = df_cls['cut'].map(cut_map)
df_cls['color'] = df_cls['color'].map(color_map)
df_cls['clarity'] = df_cls['clarity'].map(clarity_map)
In [67]:
X = df_cls.drop(columns=['cut'])
y = df_cls['cut']
In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
In [69]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [70]:
models = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42),
    "SVM": SVC(kernel='rbf', random_state=42)
    }
In [71]:
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    print(f"--- {name} Results ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred, target_names=cut_map.keys(), zero_division=0))
Training KNN...
--- KNN Results ---
Accuracy: 0.6633
              precision    recall  f1-score   support

        Fair       0.56      0.20      0.30        99
        Good       0.54      0.54      0.54      1122
   Very Good       0.42      0.35      0.38      3295
     Premium       0.63      0.65      0.64      3625
       Ideal       0.81      0.87      0.84      6084

    accuracy                           0.66     14225
   macro avg       0.59      0.52      0.54     14225
weighted avg       0.65      0.66      0.65     14225


Training Decision Tree...
--- Decision Tree Results ---
Accuracy: 0.7334
              precision    recall  f1-score   support

        Fair       0.75      0.76      0.75        99
        Good       0.78      0.56      0.65      1122
   Very Good       0.67      0.31      0.43      3295
     Premium       0.64      0.85      0.73      3625
       Ideal       0.81      0.92      0.86      6084

    accuracy                           0.73     14225
   macro avg       0.73      0.68      0.68     14225
weighted avg       0.73      0.73      0.71     14225


Training SVM...
--- SVM Results ---
Accuracy: 0.7466
              precision    recall  f1-score   support

        Fair       0.92      0.12      0.21        99
        Good       0.70      0.54      0.61      1122
   Very Good       0.61      0.45      0.52      3295
     Premium       0.72      0.81      0.76      3625
       Ideal       0.81      0.92      0.86      6084

    accuracy                           0.75     14225
   macro avg       0.75      0.57      0.59     14225
weighted avg       0.74      0.75      0.73     14225

plotting¶

In [72]:
plt.figure(figsize=(30, 20))
plot_tree(models["Decision Tree"], 
          feature_names=X.columns, 
          class_names=list(cut_map.keys()), 
          filled=True, 
          rounded=True, 
          fontsize=10,
          max_depth=4) 
plt.title("Decision Tree Visualization (Truncated to Depth 4)")
plt.show()
No description has been provided for this image
In [73]:
subset = df_cls.sample(n=1000, random_state=42)
X_viz = subset[['table', 'depth']].values
y_viz = subset['cut'].values

scaler_viz = StandardScaler()
X_viz_scaled = scaler_viz.fit_transform(X_viz)


svm_viz = SVC(kernel='rbf', C=1.0, gamma='auto', random_state=42)
svm_viz.fit(X_viz_scaled, y_viz)

h = .02  
x_min, x_max = X_viz_scaled[:, 0].min() - 1, X_viz_scaled[:, 0].max() + 1
y_min, y_max = X_viz_scaled[:, 1].min() - 1, X_viz_scaled[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))


Z = svm_viz.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)


plt.figure(figsize=(10, 8))
plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.coolwarm)


scatter = plt.scatter(X_viz_scaled[:, 0], X_viz_scaled[:, 1], c=y_viz, 
                      cmap=plt.cm.coolwarm, edgecolors='k', s=20)

plt.xlabel('Table (Scaled)')
plt.ylabel('Depth (Scaled)')
plt.title('SVM Decision Boundary (Table vs Depth)')
plt.legend(*scatter.legend_elements(), title="Cut Quality")
plt.show()
No description has been provided for this image

UI¶

In [74]:
import joblib


final_model = models["Decision Tree"] 
final_scaler = scaler 

joblib.dump(final_model, 'diamond_model.pkl')
joblib.dump(final_scaler, 'diamond_scaler.pkl')

print("✅ Success! Model and Scaler have been saved.")
✅ Success! Model and Scaler have been saved.
In [79]:
print("🔄 Saving your custom Decision Tree for the App...")

if 'df_cleaned' in locals():
    df_use = df_cleaned.copy()
else:
    import seaborn as sns
    df_use = sns.load_dataset('diamonds') 

# Mapping
cut_map = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
color_map = {'J': 1, 'I': 2, 'H': 3, 'G': 4, 'F': 5, 'E': 6, 'D': 7}
clarity_map = {'I1': 1, 'SI2': 2, 'SI1': 3, 'VS2': 4, 'VS1': 5, 'VVS2': 6, 'VVS1': 7, 'IF': 8}

for col, mapper in zip(['cut', 'color', 'clarity'], [cut_map, color_map, clarity_map]):
    if df_use[col].dtype == 'object':
        df_use[col] = df_use[col].map(mapper)

X = df_use.drop(columns=['price'])

y = np.log(df_use['price']) 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

treeModel = DecisionTreeRegressor(max_depth=20, min_samples_leaf=10, max_features=5, random_state=42)
treeModel.fit(X_train_scaled, y_train)

joblib.dump(treeModel, 'diamond_price_model.pkl')
joblib.dump(scaler, 'diamond_price_scaler.pkl')

print("✅ Success!")
🔄 Saving your custom Decision Tree for the App...
✅ Success!